---
title: 'Replication Date for: Do All Roads Lead to Sapporo? The Role of Linking and
  Bridging Ties in Evacuation Decisions'
author: "Timothy Fraser, Daniel P. Aldrich, Larissa Morikawa"
output:
  html_document:
    df_print: paged
---

This document outlines the code necessary to reproduce our statistical analyses.

# 0. Packages

```{r,  warning = FALSE, message = FALSE, warning = FALSE}
# Load Packages
library(tidyverse)
library(Zelig)
library(gridExtra)
library(readxl)
library(viridis)
library(texreg)
library(lmtest)
```


# 1. Data Preparation

## 1.1 Summarizing data from Japanese census


First, let's import raw census data, compile it, and export it as quake_data_processing.csv.

```{r,  warning = FALSE, message=FALSE, warning=FALSE}
bind_rows(
  
  # Import a select few variables from this dataset
  read_csv("estat_data_part_a.csv", skip = 8) %>%
    mutate_all(funs(as.double(str_remove_all(., "[,]")))),
  
  read_csv("estat_data_part_a2.csv", skip = 8) %>%
    mutate_all(funs(as.double(str_remove_all(., "[,]"))))
  ) %>%
  
  # Recode area code
  mutate(muni_code = `AREA Code` %>% str_pad(5, "left", "0")) %>%
  
  # Join in other data
  left_join(by = c("muni_code", "YEAR"),
            y = bind_rows(
              
              read_csv("estat_data_part_b.csv", skip = 8) %>%
                mutate_all(funs(as.double(str_remove_all(., "[,]")))),
              
              read_csv("estat_data_part_b2.csv", skip = 8) %>%
                mutate_all(funs(as.double(str_remove_all(., "[,]"))))
              ) %>%
              # Recode area code
              mutate(muni_code = `AREA Code` %>% str_pad(5, "left", "0"))
            ) %>%
  # Select and rename data
  select(muni_code,
         year = YEAR,
         pop = "A1101_Total population (Both sexes)[person]",
         exp_fire = "D320309_(Expenditure) Fire service (local public finance, municipalities)[thousand yen]",
         exp_dis_relief = "D320311_Disaster relief expenditure (municipalities)[thousand yen]",
         kindergartens =  "E1101_Number of kindergartens[number of kindergartens]",
         elementary = "E2101_Number of elementary schools[number of schools]",
         secondary = "E3101_Number of lower secondary schools[number of schools]",
         upper_secondary = "E4101_Number of upper secondary schools[number of schools]",
         public_halls = "G1201_No. of public halls[number of community centers]",
         libraries = "G1401_No. of libraries[number of community centers]",
         owned_homes = "H1310_No. of owned houses[dwellings]",
         rented_homes = "H1320_No. of rented houses[dwellings]",
         public_rented_homes = "H1321_No. of rented houses owned by local government, urban renaissance agency or public corporation[dwellings]",
         rented_homes_public_govt = "H132101_No. of rented houses owned by local government[dwellings]",
         rented_homes_public_not_govt = "H132102_No. of rented houses owned by urban renaissance agency or public corporation[dwellings]",
         rented_homes_private = "H1322_No. of rented houses owned privately[dwellings]",
         owned_homes_earthquake_resist_built = "H2270_No. of dwellings getting earthquake resistance construction (Owned)[dwellings]",                                                                   
         owned_homes_earthquake_resist_renov = "H227004_Total No. of owned houses renovated to make it earthquake-resistant in and after 2009[dwellings]") %>%
  
  
  # Join in a few additional variables
  left_join(
     # Join using municipality code and year
    by = c("muni_code", "year"),
    y = read_xlsx(path = "disaster_data.xlsx", sheet = "stats") %>%
      # repair 5-digit municipality code
      mutate(muni_code = muni_code %>% str_pad(5,"left","0")) %>%
      # select main variables
      select(year, muni_code,
             financial_strength_index = financial_strength_index_2016,
             rev_to_exp_ratio = rev_to_exp_ratio_2016,
             exp_public_works_thous = exp_public_works_thous_2016) %>%
      # Convert to numeric
      mutate_at(vars(-c(year, muni_code)), funs(as.numeric))
   ) %>%
  
  # Pivot to long form
  pivot_longer(cols = -c(muni_code, year),
               names_to = "measure",
               values_to = "value") %>%
  
  # Remove urban wards outside of Tokyo from data
  filter(muni_code %in% read_csv("muni_code.csv")$muni_code) %>%
  
  # Export to file
  write.csv(file = "quake_data_processing.csv", row.names = FALSE)
```


Next, we identify which of the variables we collected are missing less than 5% of data points in any year after 2010. Those which meet our criteria are eligible variables we can use for our analysis.

```{r,  warning = FALSE, message = FALSE, warning = FALSE}
# Identify variables missing less than 5% of data for a year
read_csv("quake_data_processing.csv") %>%
  group_by(measure, year) %>%
  # Count missing data, divided by total
  summarize(
    missing = sum(
      if_else(is.na(value), 1, 0)) / n()) %>%
  # Filter to just those missing less than 5%
  # and those pertaining to years during or after 2010
  filter(missing < 0.05 & year >= 2010) %>% ungroup %>%
  # Create variable name with year
  mutate(measure = paste(measure, year, sep = "_")) %>%
  select(measure)
```


Next, we want to create a smaller dataset of our eligible variables, merging our variables from *quake_data_processing.csv* into the list of 1741 municipalities.

```{r,  warning = FALSE, message = FALSE, warning = FALSE}

read_csv("muni_code.csv") %>%
  
  left_join(by = "muni_code",
            # Import in valid variables
            y = read_csv("quake_data_processing.csv") %>%
              pivot_wider(id_cols = muni_code,
                          names_from = c(measure, year),
                          values_from = value) %>%
              
              # Get most recent variables that were valid
              select(muni_code, pop_2015,
                     kindergartens_2017, elementary_2017, 
                     secondary_2017, upper_secondary_2017,
                     public_halls_2015, libraries_2015, 
                     exp_fire_2016, exp_dis_relief_2016,
                     exp_public_works_2016 = exp_public_works_thous_2016, # rename
                     financial_strength_index_2016,
                     rev_to_exp_ratio_2016) %>%
              
              rowwise() %>%
              mutate(
                # count total number of potential public shelters per thousand persons
                shelters_per_capita_2017 = sum(
                  kindergartens_2017, elementary_2017, secondary_2017, 
                  upper_secondary_2017, public_halls_2015, libraries_2015) / pop_2015 * 1000,
                # Get spending on emergencies, disasters, and public works per capita (in thousands of yen)
                exp_fire_per_capita_2016 = exp_fire_2016 / pop_2015,
                exp_dis_relief_per_capita_2016 = exp_dis_relief_2016 / pop_2015,
                exp_public_works_per_capita_2016 = exp_public_works_2016 / pop_2015)) %>%
  
  # We have created three more of our main control variables:
  # emergency services spending (proxy: expenditure of fire departments)
  # shelters per capita (proxy: schools and community centers)
  # population
  
  # Join in vulnerability and social capital data
  left_join(
    by = c("muni_code"),
    y = read_csv("indices.csv") %>%
  filter(year == 2017) %>%
  select(muni_code, social_capital, bonding, bridging, linking, vulnerability)) %>%

  # Export these to file
  write.csv(file = "quake_muni_data.csv", row.names = FALSE)

  

```



## 1.2. Summarizing government data on Iburi Earthquake

The Japanese government reported data in several different ways after the Eastern Iburi Earthquake. The cabinet reported water and communication failures due to the quake and resulting power outage. The prefectural government reported deaths, injuries, residences damaged, and other buildings damaged. And finally, the cabinet also reported which municipalities received evacuation orders, and how many evacuees went to each designated shelter. We summarize these numbers into a single table we can use for data analysis.

First, we grabbed a list of all municipalities in Hokkaido.
```{r,  warning = FALSE}
# Import like of municipality names and codes
muni <- read_xlsx("disaster_data.xlsx", sheet = "cities") %>%
  # repair 5-digit municipality code
  mutate(muni_code = muni_code %>% str_pad(5,"left","0")) %>%
  filter(muni_code %>% str_sub(1,2) %>% str_detect("01"))
```

Next, we identified the total number of deaths, injuries, homes damaged, and non-residential properties damaged.

```{r,  warning = FALSE}
# Add code to damages file
damages = read_xlsx("disaster_data.xlsx", sheet = "damages") %>%
  left_join(y = muni, by = "muni") %>%
  pivot_longer(
    cols = -c(muni, muni_code),
    names_to = "measure",
    values_to = "value") %>%
  # replace blanks with 0
  mutate(value = if_else(is.na(value), 0, value)) %>%
  # Now recode measures we will consolidate
  mutate(measure = measure %>% recode(
    "injured_serious" = "injured",
    "injured_moderate" = "injured", 
    "injured_minor" = "injured",
    "homes_destroyed" = "homes_damaged",
    "homes_partially_destroyed" = "homes_damaged",
    "homes_damaged" = "homes_damaged",
    "nonresidential_destroyed" = "nonresidential_damaged",
    "nonresidential_partially_destroyed" = "nonresidential_damaged",
    "nonresidential_damaged" = "nonresidential_damaged")) %>%
  # Aggregate all of the categories above together
  group_by(muni, muni_code, measure) %>%
  summarize(value = sum(value)) %>%
  # Pivot back out into wider format
  pivot_wider(
    id_cols = c(muni, muni_code),
    names_from = measure,
    values_from = value)

```

Next, we calculated the total cases of water outages per capita for each municipality.

```{r,  warning = FALSE, message = FALSE, warning = FALSE}
# Add code to water file
water = read_xlsx("disaster_data.xlsx", sheet = "water") %>%
  left_join(y = muni, by = "muni") %>%
  # Select key variables
  select(muni, muni_code, 
         outages_peak, outages_current, 
         outage_length_days, note) %>%
  # Convert all to numeric
  mutate_at(vars(-c(muni, muni_code, note)), funs(as.numeric)) %>%
  mutate_at(vars(-c(muni, muni_code, note)), funs(if_else(is.na(.), 0, .))) %>%
    # Join in population
  left_join(y = read_csv("quake_muni_data.csv") %>%
              select(muni_code, pop_2015),
            by = "muni_code") %>%
    # Calculate per capita how many cases of water outages occurred
  mutate(
    outages_peak_per_capita = outages_peak / pop_2015,
    outages_current_per_capita = outages_current / pop_2015
  )  %>%
  # If it mentioned aggregation (and therefore wasn't NA), 
  # divide by the number of cases which were aggregated (2)
  mutate_at(vars(c(outages_peak_per_capita, outages_current_per_capita)),
            funs(if_else(is.na(note), . / 2, .))) %>%
  select(-pop_2015, -note, -outages_peak, -outages_current)

```

Next, we calculated the total number of television service contracts per capita disconnected due to the quake or power outage.

```{r,  warning = FALSE}
# Add code to communication file
com <- read_xlsx("disaster_data.xlsx", sheet = "communication") %>%
  # Remove any instances of Hokkaido that you see
  mutate(muni = str_remove(muni, "北海道")) %>%
  # If a Sapporo city ward is listed, recode it with just the name Sapporo City
  mutate(muni = if_else(muni %>% str_detect("札幌市"),
                        muni %>% str_extract("札幌市"), muni)) %>% 
  left_join(y = muni, by = "muni") %>%
  # Convert all households to numeric, and keep only those which were not character responses
  mutate(households = as.numeric(households)) %>%
  filter(!is.na(households)) %>%
   # Join in population
  left_join(y = read_csv("quake_muni_data.csv") %>%
              select(muni_code, pop_2015),
            by = "muni_code")
```

```{r,  warning = FALSE}
# For some areas, the state reported aggregated data.
# To approximate the real number of households disconnected in each community,
# We will multiply the number of households affected in aggregate by the town's population,
# divided by the aggregate area's population, creating a population controlled measure for households.

com = com %>%
  left_join(by = c("type",
                   "station",
                   "households"),
            y = com %>%
              filter(note == "aggregated") %>%
              group_by(type, station, households) %>%
              summarize(
                # IF the note says aggregated,
                # calculate the total population that applied to the cases 
                # where the number of affected households was the same
                pop_2015_total = sum(pop_2015, na.rm = TRUE))) %>%
  # If note == "aggregated" (and therefore NOT NA)
  mutate(households = if_else(
    !is.na(note),
    true = households * pop_2015 / pop_2015_total,
    false = households)) %>%
  
  # How many households connections to TV stations were lost, per capita?
  # Where one household could have many connections
  # This would be a useful measure of *EXTREME* disconnection;
  # We would rank someone disconnected from ALL their TV services as more disconnected than
  # someone disconnected from just one or some of them.
  
  group_by(muni, muni_code) %>%
  # Calculate the sum of households, and divide it by the population in each town.
  # Note: because the same towns reoccur, taking the mean of population is just to 
  # bring it down to a single value for each town. 
  # It does not actually change the value of population at all.
  summarize( 
    households_disconnected_per_capita = sum(households, na.rm = TRUE) / mean(pop_2015))

com %>%
  filter(households_disconnected_per_capita > 0)
```

Next, we calculated how many shelters were opened per municipality to receive evacuees, and how many residents actually used those municipal shelters. 

```{r,  warning = FALSE, message = FALSE, warning = FALSE}
# Add code to evacuation shelters file
shelters = read_xlsx("disaster_data.xlsx", sheet = "shelters") %>%
  rename(muni = municipality) %>%
  left_join(y = muni, by = c("muni")) %>%
  # Replace any instances of NA with 0 (because in this case, it really does mean zero)
  mutate(evacuees = if_else(is.na(evacuees), 0, evacuees)) %>%
  # Select just relevant variables
  select(muni, muni_code, evacuation_shelter, open_date, evacuees) %>%
  # For each town,
  group_by(muni, muni_code) %>%
  summarize(
    # # how many shelters were opened?
    shelters_opened = n())
```


```{r,  warning = FALSE}

people = read_xlsx("disaster_data.xlsx", sheet = "evacuation") %>%
  left_join(y = muni, by = c("muni"))


```

The number that used shelters can be seen here per town.

```{r,  warning = FALSE}
# Visualize how many persons evacuated to town shelters
people %>%
  filter(evacuees > 0) %>% ungroup %>%
  mutate(muni = muni %>% recode(
    "北広島市" = "Kita-Hiroshima City",
    "札幌市" = "Sapporo City",
    "むかわ町" = "Mukawa Town",
    "安平町" = "Abira Town",
    "厚真町" = "Atsuma Town",
    "恵庭市" = "Eniwa City",
    "平取町" = "Biratori Town",
    "日高町" = "Hidaka Town",
    "苫小牧市" = "Tomakomai City")) %>%
  ggplot(mapping = aes(x = date, y = evacuees, color = muni, group = muni)) +
   geom_line(size = 1.5) +
  geom_point(size = 3) +
  geom_point(size = 2, color = "white", show.legend = TRUE) +
  theme_minimal() +
  scale_color_viridis(option = "viridis", begin = 0.1, end = 0.9, discrete = TRUE) +
  labs(x = "Week", y = "Persons evacuated to Municipal Shelters",
       color = "Municipality",
       title = "Evacuation to Municipal Shelters after Earthquake")
  
  
#  ggplot(mapping = aes(x = reorder(muni, evacuees), y = evacuees, fill = muni)) +
#  geom_col() + coord_flip() +
#  theme_minimal() +
#  facet_wrap(~date) +
#  scale_fill_viridis(option = "viridis", begin = 0.1, end = 0.9, discrete = TRUE) +
#  labs(x = "Municipality", y = "Persons evacuated to Municipal Shelters",
#       title = "Municipal Shelters used after Earthquake")
```


Then, we calculated the total number of people in households that were specifically asked to evacuate.

```{r,  warning = FALSE}
# Total people specifically asked to evacuate
evac = read_xlsx("disaster_data.xlsx", sheet = "evacuation_orders") %>%
  rename(muni = municipality) %>%
  left_join(y = muni, by = c("muni")) %>%
  group_by(muni, muni_code) %>%
  summarize(people_asked_to_evacuate = sum(target_people, na.rm = TRUE))
```


Finally, we joined these together with our *quake_muni_data.csv*.
```{r,  warning = FALSE, message = FALSE, warnings = FALSE}
# Merge into our quake data




read_csv("quake_muni_data.csv") %>% 
  # Merge in each kind of data
  left_join(by= "muni_code",
            y = muni %>%
              left_join(y = damages, by = c("muni_code", "muni")) %>%
              left_join(y = shelters, by = c("muni_code", "muni")) %>%
              left_join(y = water, by = c("muni_code", "muni")) %>%
              left_join(y = evac, by = c("muni_code", "muni")) %>%
              left_join(y = com, by = c("muni_code", "muni")) %>%
              select(-muni)) %>%
  left_join(by = "muni_code",
            y = people %>%
              select(-muni, -source) %>%
              mutate(date = date %>% str_replace_all(pattern = "[-]", replacement = "_")) %>%
              pivot_wider(id_cols = muni_code,
                          names_from = date,
                          names_prefix = "evacuees_",
                          values_from = evacuees)) %>%
  
  # Now if any of the variables we just imported have NAs,
  # it's because the other towns outside of Hokkaido were unaffected.
  # So let's recode only those variables with 0.
  mutate_at(vars(
    deaths, evacuation_advisory, evacuation_order_urgent,
    homes_damaged, injured, nonresidential_damaged,
    shelters_opened,
    evacuees_2018_09_10,
    evacuees_2018_09_17,
    evacuees_2018_09_24,
    evacuees_2018_10_01,
    evacuees_2018_10_08,
    outage_length_days,
    outages_peak_per_capita, outages_current_per_capita,
    people_asked_to_evacuate, households_disconnected_per_capita),
    funs(if_else(is.na(.), 0, as.numeric(.)))) %>%
  # Let's resave our new file over top of the old one.
  write_csv("quake_muni_data.csv")

remove(evac, muni, shelters, water, com, damages)
```


## 1.3 Calculating geographic distance from epicenter

Next, we need to calculate how far each city that Facebook observed was from the epicenter.
To do so, we use a shapefile of municipal political boundaries from the Ministry of Land, Infrastructure, and Transportation. We aggregate its boundaries to the district level, combining any municipalities into districts that apply, and then calculate district centroids using an East Asian Albers Equal Area Conic projection. Then, using an East Asian Albers Equal Distance Conic projection, we calculate the euclidean distance in kilometers between the epicenter and each district centroid. 

First, we gather the data necessary for each projection.
```{r,  warning = FALSE, eval = FALSE}
# This data uses the North East Asia Albers Equal Distance Conic Projection
eqdc = "+proj=eqdc +lat_0=0 +lon_0=0 +lat_1=15 +lat_2=65 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs"

# and also the North East Asia Albers Equal Area Conic Project
eqac = "+proj=aea +lat_1=15 +lat_2=65 +lat_0=30 +lon_0=95 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs"

```


Second,  we create a shapefile aggregated to the district level
```{r,  warning = FALSE, eval=FALSE}
# Take Japan municipal shapefile and aggregate it to the district level,
# to match our facebook IDs.
library(tidyverse)
library(sf)
library(rgdal)

# First, let's read in our shapefile of municipal political boundaries
jp_area = read_sf("japan_shapefile/polbnda_jpn.shp") %>%
  # Transform to Asia North Albers Equal Area Conic projection
  # Obtained here: https://spatialreference.org/ref/esri/102025/
  st_transform(
    CRS(eqac)) %>%
  # Wherever the adm_code is the same as the area_code in our dataset of unique facebook IDs,
  # merge in their information
  # Add in unique district ids, which have replaced adm codes with special facebook ids
  # whenever Facebook provided us with district data instead of municipal data
  left_join(by = c("adm_code" = "area_code"),
            y = read_csv("facebook_unique_ids.csv") %>% 
              select(id_facebook, area_code)) %>%
  # Give every object a unique facebook id, 
  # drawing from adm_code when there is no facebook id available.
  mutate(id_facebook = if_else(!is.na(id_facebook), id_facebook, adm_code)) %>%
  # Keep only cases with a valid facebook ID
  filter(!is.na(id_facebook)) %>%
  # Now bind together polygons with the same id_facebook into single polygons
  group_by(id_facebook) %>%
  summarize(geometry = st_union(geometry))

save(jp_area, file = "jp_area.RData")
```


Third, we calculate district centroids and generate the epicenter point, and then transform them to an East Asian Equal Distance Conic Projection for the next step.

```{r,  warning = FALSE, eval=FALSE}
# Using our Japanese districts data.frames, 
# calculate centroid point data for each district
centroids = jp_area %>%
  group_by(id_facebook) %>%
  summarize(centroid = st_centroid(geometry)) %>% ungroup %>%
  # Converts centroids to Equal Distance Conic Projection
  st_transform(crs = eqdc)

save(centroids, file = "centroids.RData")

# Now import epicenter location as a kmz file
epi = st_read("epicenter.kmz") %>% 
  # Converts centroids to Equal Distance Conic Projection
  st_transform(eqdc)

save(epi, file = "epicenter.Rdata")
```

Fourth, we calculate the distance between each district and the epicenter.

```{r,  warning = FALSE, eval=FALSE}
# Calculate euclidean distance from epicenter

# For each centroid
centroids %>%
  # Calculate euclidean distance
  st_distance(
    # from the epicenter
    y = epi$geometry,
    by_element = TRUE, which = "Euclidean") %>%
  as.data.frame() %>%
  magrittr::set_colnames("km") %>%
  mutate(id_facebook = centroids$id_facebook,
         km = as.numeric(km) / 1000) %>%
  write.csv(file = "epicenter_distances.csv", row.names = FALSE)

```

Fifth, we save all of them for future use, and then get rid of their data.

```{r,  warning = FALSE, eval=FALSE}
# Save geospatial data
save(
  list = c("jp_area", "centroids", "epi", "eqac", "eqdc"), 
  file = "geo_data.RData")
```

```{r,  warning = FALSE, eval = FALSE, warning = FALSE, message = FALSE}
remove(jp_area, centroids, epi, eqac, eqdc)
```


## 1.4 Aggregate from Municipal to District level for Predictor Data

We face one more tricky task. In order to assess evacuation in the same terms that Facebook did, we need to use the same jurisdictions. However, Facebook measures evacuation at the district level - an in-between level that treats cities and groups of towns the same - as opposed to the Japanese census's method of distinguishing jurisdictions, which treats cities, towns, villages, and urban wards all as independent units. So, we're going to aggregate up to the district level, summing or averaging any variables that need to be joined.


    
```{r,  warning = FALSE, message = FALSE, warning = FALSE}
# However, our outcome data occurs at the district level, not the municipal level. That is to say, many cities are districts themselves, but often small towns get grouped together into districts. The Japanese census does not collect district level data; so we have to average and aggregate to get to that level.

# To get around this, we created a map to translate Hokkaido municipality codes into Facebook IDs 

read_csv("quake_muni_data.csv") %>%
    # Join the map/thesaurus to our municipal data
    left_join(by = c("muni_code" = "area_code"),
            y = read_csv("facebook_unique_ids.csv") %>%
              select(id_facebook, area_code)) %>%
    # If a special facebook area code got joined in, use that. 
  # Otherwise, use a municipality code.
    mutate(id_facebook = if_else(!is.na(id_facebook), id_facebook, muni_code)) %>%
  group_by(id_facebook) %>%
  summarize(
    # Population
    pop_2015 = sum(pop_2015),
    # Raw disaster data
    deaths_per_capita = sum(deaths) / mean(pop_2015),
    injured_per_capita = sum(injured) / mean(pop_2015),
    buildings_damaged_per_capita = (sum(homes_damaged) + sum(nonresidential_damaged)) / mean(pop_2015),
    homes_damaged_per_capita = sum(homes_damaged) / mean(pop_2015),
    nonresidential_damaged_per_capita = sum(nonresidential_damaged) / mean(pop_2015),
    shelters_opened_per_capita = sum(shelters_opened) / mean(pop_2015),
#    evacuees_per_capita = sum(evacuees) / mean(pop_2015),
    evacuees_per_capita_2018_09_10 = sum(evacuees_2018_09_10) / mean(pop_2015),
    evacuees_per_capita_2018_09_17 = sum(evacuees_2018_09_17) / mean(pop_2015),
    evacuees_per_capita_2018_09_24 = sum(evacuees_2018_09_24) / mean(pop_2015),
    evacuees_per_capita_2018_10_01 = sum(evacuees_2018_10_01) / mean(pop_2015),
    evacuees_per_capita_2018_10_08 = sum(evacuees_2018_10_08) / mean(pop_2015),

    water_outage_length_days = mean(outage_length_days),
    water_outages_peak_per_capita = mean(outages_peak_per_capita),
    water_outages_current_per_capita = mean(outages_current_per_capita),
    people_asked_to_evacuate_per_capita = sum(people_asked_to_evacuate) / mean(pop_2015),
    households_disconnected_per_capita = mean(people_asked_to_evacuate),
    # Capacity data
    rev_to_exp_ratio_2016 = mean(rev_to_exp_ratio_2016),
    financial_strength_index_2016 = mean(financial_strength_index_2016),
    # Emergency Public Goods Data
    shelters_per_capita_2017 = mean(shelters_per_capita_2017),
    exp_fire_per_capita_2016 = mean(exp_fire_per_capita_2016),
    exp_public_works_per_capita_2016 = mean(exp_public_works_per_capita_2016),
    exp_dis_relief_per_capita_2016 = mean(exp_dis_relief_per_capita_2016),
    # Social Capital and Vulnerability Data
    social = mean(social_capital),
    bonding = mean(bonding),
    bridging = mean(bridging),
    linking = mean(linking),
    vulnerability = mean(vulnerability)) %>%
  write.csv(file = "quake_district_data.csv", row.names = FALSE)
```


## 1.5 Generate Facebook District Data for Home Cities affected by Quake

First, we must bind together and format our Facebook data.

```{r,  warning = FALSE, echo = FALSE, message = FALSE}
# Combine displacement files and edit
bind_rows(
  read_csv("the_earthquake_in_hokkaido_japan_japan_displacement_08_27.csv"),
  read_csv("the_earthquake_in_hokkaido_japan_japan_displacement_09_03.csv"),
  read_csv("the_earthquake_in_hokkaido_japan_japan_displacement_09_10.csv"),
  read_csv("the_earthquake_in_hokkaido_japan_japan_displacement_09_17.csv"),
  read_csv("the_earthquake_in_hokkaido_japan_japan_displacement_09_24.csv"),
  read_csv("the_earthquake_in_hokkaido_japan_japan_displacement_10_01.csv"),
  read_csv("the_earthquake_in_hokkaido_japan_japan_displacement_10_08.csv")
) %>% 

# create placeholders to receive values from subsequent for loop

  # Edit the Geometry object
  mutate(Geometry = Geometry %>% 
           str_remove("POINT ") %>%
           str_remove_all("[(]|[)]")) %>%
  # split geometry into the fields longitude and latitude
  separate(Geometry, sep = " ", c("long", "lat")) %>%
  
  # Now replace Hiro-gun with correct coordinates
  mutate(long = if_else(`City Name` == "hiro-gun", 143.3117, as.numeric(long)),
         lat = if_else(`City Name` == "hiro-gun", 42.2860, as.numeric(lat))) %>% 
  # Join in unique ID information
  left_join(y = read_csv("facebook_unique_ids.csv"),
            by = c("City Name" = "id_facebook")) %>%
  # Export
  write.csv(file = "disp.csv", row.names = FALSE)
```


Next, we bind into that data our district-level predictors.

```{r,  warning = FALSE, message=FALSE, warning=FALSE, include=FALSE}

dat = read_csv("disp.csv") %>%
  # look only at weeks after the disaster, and cities hit by disaster
  filter(`Is Home City` == TRUE & `Ds` >= "2018-09-06") %>%
  # select and rename variables
  select(id_facebook = `City Name`, date = `Ds`, 
         num_people_diff = `Number of People Different`, 
         displaced_from_here = `Total Displaced from Here`, 
         lat, long, area_code) %>%
  # join in predictor data (excluding number of evacuees)
  left_join(y = read_csv("quake_district_data.csv") %>%
              select(-c(evacuees_per_capita_2018_09_10,
                        evacuees_per_capita_2018_09_17,
                        evacuees_per_capita_2018_09_24,
                        evacuees_per_capita_2018_10_01,
                        evacuees_per_capita_2018_10_08)),
            by = "id_facebook") %>%
  left_join(y = read_csv("epicenter_distances.csv"),
            by = "id_facebook") %>%
  # join in  number of evacuees)
  left_join(
    y =   read_csv("quake_district_data.csv") %>%
              select(id_facebook, 
                     evacuees_per_capita_2018_09_10,
                     evacuees_per_capita_2018_09_17,
                        evacuees_per_capita_2018_09_24,
                        evacuees_per_capita_2018_10_01,
                        evacuees_per_capita_2018_10_08) %>%
    pivot_longer(cols = -c(id_facebook),
                 names_to = "date",names_prefix = "evacuees_per_capita_",
                 values_to = "evacuees_per_capita") %>%
    mutate(date = date %>% str_replace_all(pattern = "[_]", replacement = "-") %>% as.Date()),
    by = c("id_facebook", "date")) %>%
  # Convert date to a factor
  mutate(date = factor(date)) %>%
  write_csv("facebook_data.csv")
# Number of People Different tells us
# how many more/fewer users are there in this city,
# relative to the previous week,
# looking only at people who were in disaster region prior to the crisis.
```

Finally, since that data is negative in all cases except 3, we cut off those three cases, turn it positive, and measure it as a count variable, saving this new data as our *model_dataset.csv.*

```{r,  warning = FALSE, message=FALSE, warning=FALSE}
# Assess just evacuation as a negative binomial model
dat.count = dat %>%
  # Grab only cases that saw net-negative movement (evacuation)
  filter(num_people_diff < 0) %>%
  # turn those numbers into positive counts
  mutate(num_people_diff = abs(num_people_diff))  %>%
  # Create a population controlled dependent variable
  mutate(num_people_diff_per_capita = num_people_diff / pop_2015) %>%
  # convert date to factor
  mutate(date = factor(date))


# Let's write dat.count to file
write_csv(dat.count, path = "model_dataset.csv")
```


To demonstrate the problem, we visualize how there is considerable variation in evacuation outcomes.

```{r,  warning = FALSE, warning = FALSE, message = FALSE}
# Why is this a legitimate choice? Because only three city-weeks in this data set saw increases in movement to them. 
library(gridExtra)
library(viridis)

p1 = dat.count %>%
  ggplot(mapping = aes(x = num_people_diff)) +
  geom_histogram(aes(y = ..density..), fill = "firebrick", color = "white", bin = 50) +
  geom_density(fill = "firebrick", color = "white", alpha = 0.25) +
  xlim(0, 2000) + 
  theme_bw() +
  labs(x = "Estimated Evacuation out of Cities", y = "% of City-Weeks)",
       title = "Variation in Evacuation (September 6 - October 10)")

p2 = dat.count %>%
  ggplot(mapping = aes(x = num_people_diff, fill = date)) +
  geom_density(color = "white", alpha = 0.25) +
  xlim(0, 2000) + 
#  scale_fill_viridis(option = "viridis", discrete = TRUE) +
  theme_bw() +
  labs(x = "Estimated Evacuation out of Cities", y = "% of City-Weeks",
       caption = "Truncated at 2,000 people; excludes 5 outliers where 22,447 - 46,620 persons moved.",
       title = "Evacuation by Week",
       fill = "Week")

# Visualize
gridExtra::grid.arrange(p1, p2)
```

```{r,  warning = FALSE}
remove(p1, p2, dat)
```


# 2. Basic Model

Now that we have gathered our outcome and predictor data, we can import it into R for analysis.

```{r,  warning = FALSE, message=FALSE, warning = FALSE}
library(tidyverse)
library(Zelig)
dat.count = read_csv("model_dataset.csv") %>%
  mutate(date = factor(date)) %>%
   # recode also date into a numeric variable indicating days after the earthquake
  mutate(time = date %>% recode(
    "2018-09-10" = 5,
    "2018-09-17" = 12,
    "2018-09-24" = 19,
    "2018-10-01" = 26,
    "2018-10-08" = 33))
```

## 2.1 Log-transformed OLS Model

```{r,  warning = FALSE}
# Log transform the dependent variable and run an OLS model

m.lm <- dat.count %>%
  mutate_at(vars(-c("id_facebook", "date", 
                    "lat", "long", "area_code",
                    "num_people_diff", "num_people_diff_per_capita")), scale) %>%
 zelig(formula = log(num_people_diff_per_capita + 0.0000000001) ~ km +
          bonding + 
          bridging + 
          linking +
          vulnerability +
          exp_public_works_per_capita_2016 +
          exp_fire_per_capita_2016 +
          shelters_opened_per_capita +
          water_outage_length_days +
          buildings_damaged_per_capita +
          evacuees_per_capita +
          rev_to_exp_ratio_2016 +
          date, model = "ls")

# The following variables could not be fit due to multicollinearity
#  shelters_per_capita_2017
#  people_asked_to_evacuate_per_capita
#  water_outages_peak_per_capita
#  financial_strength_index_2016
#  households_disconnected_per_capita
#  We do not include households per capita because it is correlated (r = .99) with buildings per capita damaged, which is a more vital concept to control for to ensure the validity of the model.

# Statistically significant F test of model significance
# And Adjusted R2 of 0.40
m.lm %>% from_zelig_model() %>% 
  summary()
```

```{r,  warning = FALSE, echo = FALSE}
m.lm %>%
  from_zelig_model() %>%
  car::vif()
```




## 2.2 Poisson & Negative Binomial Models

```{r,  warning = FALSE, include = FALSE, echo = FALSE}
# Run Poisson
m.p = dat.count %>%
  # rescale predictors (but not outcome, since it's a count and needs to stay that way)
  mutate_at(vars(-c("id_facebook", "date", 
                    "lat", "long", "area_code",
                    "num_people_diff", "num_people_diff_per_capita")), scale) %>%
  zelig(formula = num_people_diff ~ km +
          pop_2015 +
          bonding + 
          bridging + 
          linking +
          vulnerability +
          exp_public_works_per_capita_2016 +
          exp_fire_per_capita_2016 +
          shelters_opened_per_capita +
          water_outage_length_days +
          buildings_damaged_per_capita +
          evacuees_per_capita +
          rev_to_exp_ratio_2016 +
        date,
      model = "poisson")

# Run Negative Binomial
m.nb = dat.count %>%
  # rescale predictors (but not outcome, since it's a count and needs to stay that way)
  mutate_at(vars(-c("id_facebook", "date", 
                    "lat", "long", "area_code",
                    "num_people_diff", "num_people_diff_per_capita")), scale) %>%
    zelig(formula = num_people_diff ~ km +
            pop_2015 +
          bonding + 
          bridging + 
          linking +
          vulnerability +
          exp_public_works_per_capita_2016 +
          exp_fire_per_capita_2016 +
          shelters_opened_per_capita +
          water_outage_length_days +
          buildings_damaged_per_capita +
          evacuees_per_capita +
          rev_to_exp_ratio_2016 +
        date,
        model = "negbin")

# Diagnostics

# Negative binomial models assume the conditional means are not equal to the conditional variances. We can run a likelihood ratio test to compare the fit of Poisson and Negative Binomial models.
data.frame(
  nb.logLik = m.nb %>% from_zelig_model() %>% logLik,
  p.logLik = m.p %>% from_zelig_model() %>% logLik) %>%
  mutate(pchisq = pchisq(2 * (nb.logLik - p.logLik), df = 1, lower.tail = FALSE))
# The p-value of approximately 0 indicates that the negative binomial is a much better fit.
```

```{r,  warning = FALSE, include = FALSE, echo = FALSE}
# We also apply the vuong test to see if the negative binomial model is a better fit than the poisson. 
pscl::vuong(
  m.nb %>% from_zelig_model(), 
  m.p %>% from_zelig_model()) %>% 
  as.data.frame()
# It appears it is quite better.
```


```{r,  warning = FALSE, include = FALSE, echo = FALSE}
m.nb %>%
  from_zelig_model() %>%
  BaylorEdPsych::PseudoR2()
# Why is Nagelkerke's R2 so high? McFadden's isn't.
# The addition of population to a simple distance model causes
# Cox.Snell & Nagelkerke's R2 to jump from ~0.2 to ~0.99.
# That's because they are literally correlated with a pearson's r of 0.97.
# That's because population *does* explain most here.
# But because our sample size is large enough, we can explain more.
# But it appears that our model isn't broken;
# There's no colinearity;
# We've just explained most variation by population,
# and begun to explain remaining variation using additional variables.
```


```{r,  warning = FALSE, include = FALSE, echo = FALSE}
# Test independent correlation between population and evacuation
dat.count %>%
  select(num_people_diff, pop_2015) %>%
  cor()
# Quite high indeed.
```



To verify these results, we *also* transformed the outcome into a per capita measure, allowing us to model effects on evacuation completely independent of population size. This created a right skewed continuous variable. To model this, we used a gamma log-link model and a log-transformed ordinary least squares (OLS) model. Both models were statistically significant at the p < 0.001 level and delivered comparable results. The gamma model produced a Cox-Snell's Psuedo R2 of 0.43, while the log-OLS model produced an adjusted R2 of 0.40. This indicates that with population removed from the equation, the factors assembled explain up to 40% of the variation observed in evacuation movement among Facebook users. Due to the flexibility, familiarity, and interpretability of OLS models, we report the OLS results in the paper.

## 2.3 Gamma Model

```{r,  warning = FALSE, include = FALSE, echo = FALSE}
# Estimate gamma model using a log-link function
m.g = dat.count %>%
  mutate_at(vars(-c("id_facebook", "date", 
                    "lat", "long", "area_code",
                    "num_people_diff", "num_people_diff_per_capita")), scale) %>%
 glm(formula = num_people_diff_per_capita ~ km +
          bonding + 
          bridging + 
          linking +
          vulnerability +
          exp_public_works_per_capita_2016 +
          exp_fire_per_capita_2016 +
          shelters_opened_per_capita +
          water_outage_length_days +
          buildings_damaged_per_capita +
          evacuees_per_capita +
          rev_to_exp_ratio_2016 +
       date,
      family = Gamma(link = "log"))

# Do a one-tailed test of model statistical significance
data.frame(
  null.deviance = m.g$null.deviance,
  residual.deviance = m.g$deviance,
  df.null = m.g$df.null,
  df.residual = m.g$df.residual) %>%
  mutate(p.value = 1 - pchisq(null.deviance - residual.deviance, df = df.null - df.residual))
```

```{r,  warning = FALSE, include = FALSE, echo = FALSE}
# Multicolinearity is low; only a few are above the gold standard of 2.5; none are anywhere near 10.
m.g %>%
  car::vif()
```

```{r,  warning = FALSE, include = FALSE, echo = FALSE}
m.g %>%
  BaylorEdPsych::PseudoR2()
# Pretty good Cox.Snell's R2 of 0.51;
# Nagelkerke of 0.86
```





```{r,  warning = FALSE, include = FALSE, echo = FALSE, echo = FALSE}
# This suggests that overall post-disaster movement patterns have very little to do with local evacuation. This is unsurprising, considering that they describe two different phenomenon. Their lack of correlation implies that high long-distance and high short-distance evacuation in our sample are unrelated. This may be because our Facebook data describes movement all across Hokkaido as a result of the disaster.

dat.count %>%
  select(
    num_people_diff,
    num_people_diff_per_capita, 
    evacuees_per_capita) %>%
  cor() %>% as.data.frame() 
```









# 3. Appendix Table: Goodness of Fit Statistics

```{r,  warning = FALSE, echo = FALSE, message=FALSE, warning=FALSE}
# Let's build a few helper functions to 
# get goodness of fit statistics across our different models.

# Calculate the Intercept test for a GLM
glm.intercept = function(model){
  return(
    data.frame(
      null.deviance = model$null.deviance,
      residual.deviance = model$deviance,
      df.null = model$df.null,
      df.residual = model$df.residual) %>%
      mutate(p.value = 1 - pchisq(null.deviance - residual.deviance, 
                                  df = df.null - df.residual)) %>% 
      select(p.value) %>% unlist()
  )
}

# Calculate Mean VIF for each model
meanvif = function(model){
  return(
    model %>% 
      # get vif table
      car::vif() %>% 
      as.data.frame() %>%
      # Grab third column, GVIF^1/(2*df)
      select(vif = 3) %>%
      # Square it 
      # This gives you numbers comparable
      # for categorical and numeric variables
      mutate(vif = vif^2) %>% 
      # Get mean
      unlist() %>% mean()
  )
}

# Get Cox-Snell PsuedoR2
cox.snell = function(model){
  return(
    model %>%
      BaylorEdPsych::PseudoR2() %>%
      as.data.frame() %>%
      slice(3) %>% unlist()
  )
}
```


```{r,  warning = FALSE, echo=TRUE, message=FALSE, warning=FALSE}
# Compile Goodness of Fit Statistics

gof = bind_rows(
  # OLS
  data.frame(
    model = "OLS",
    # Calculate F-statistic p-value
    intercept =  summary(
      m.lm %>%
        from_zelig_model())$fstatistic %>%
      t() %>%
      as.data.frame() %>%
      mutate(p.value = df(value, numdf, dendf)) %>%
      select(p.value) %>% unlist(),
    # Get R2
    R2 = summary(
      m.lm %>%
        from_zelig_model())$adj.r.squared,
    # Get Number of Observations
    N = m.lm %>%
      from_zelig_model %>%
      nobs(),
    # Get Mean Variance Inflation Factor
    meanvif = m.lm %>%
      from_zelig_model() %>%
      meanvif()
  ),
  
  # Gamma
  data.frame(
    model = "Gamma",
    intercept = m.g %>% glm.intercept(),
    R2 = m.g %>% cox.snell(),
    N = m.g  %>% nobs(),
    meanvif = m.g  %>% meanvif()
  ),
  
  # Negative Binomial
  data.frame(
    model = "Negative Binomial",
    intercept = m.nb %>% from_zelig_model() %>% glm.intercept(),
    R2 = m.nb %>% from_zelig_model() %>% cox.snell(),
    N = m.nb %>% from_zelig_model() %>% nobs(),
    meanvif = m.nb %>% from_zelig_model() %>% meanvif()
  ))


# Display model fit table
gof %>%
  mutate_at(vars(-"model"), funs(round(., 4))) %>%
  rename(
    `Model` = model,
    `R2 / Pseudo R2` = R2,
    `Model Fit p-value` = intercept,
    `Cases` = N,
    `Average VIF` = meanvif) %>%
  knitr::kable()
```




#4. Modeling


## 4.1 Model Table I:

```{r,  warning = FALSE, include = FALSE, echo = FALSE}

# Calculate robust standard errors and p.values
robust = function(data){
  return(  
    lmtest::coeftest(data, vcov = sandwich::vcovHC(data))
  )
}
# For OLS
r.lm = m.lm %>%
      from_zelig_model() %>%
      robust() %>% broom::tidy()
# For Gamma
r.g = m.g %>%
  robust() %>% broom::tidy()
# For Negative Binomial
r.nb = m.nb %>%
  from_zelig_model() %>%
  robust() %>% broom::tidy()
```


```{r,  warning = FALSE, results = "asis", echo = FALSE, warning = FALSE}
# Create a single table for all models

texreg::htmlreg(
  list(m.lm, m.g, m.nb),
  custom.model.names = c("OLS", "Gamma", "Negative <br> Binomial"),
  bold = 0.05,
  override.se = list(
    r.lm$std.error,
    r.g$std.error,
    r.nb$std.error),
  override.pvalues = list(
    # OLS robust p.values
    r.lm$p.value,
    r.g$p.value,
    r.nb$p.value),
  custom.coef.map = list(
    "bonding" = "Bonding", # Row 3
    "bridging" = "Bridging", # Row 4
    "linking" = "Linking", # Row 5
    "vulnerability" = "Social Vulnerability Index", # Row 6
    "rev_to_exp_ratio_2016" = "Revenues to Expenditures Ratio", # Row 13
    
    "exp_fire_per_capita_2016" = "Emergency Services Spending", # Row 7
    "exp_public_works_per_capita_2016" = "Public Works Spending", # Row 8
    "shelters_opened_per_capita" = "Shelters Opened per capita", # Row 9
    
    "km" = "Distance from Epicenter", # Row 2
    "buildings_damaged_per_capita" = "Buildings Damaged <br>    per capita", # Row 11
    "evacuees_per_capita" = "Evacuees to Local Shelters <br>    per capita", # Row 12
    "water_outage_length_days" = "Days of Water Outages", # Row 10 
    
    "pop_2015" = "Population", # Row 18
    "date2018-09-17" = "Week 2", # Row 14
    "date2018-09-24" = "Week 3", # Row 15
    "date2018-10-01" = "Week 4", # Row 16
    "date2018-10-08" = "Week 5", # Row 17
    "(Intercept)" = "Constant"# Row 1
  ),
  
  groups = list("<i>Social Capital Index</i>" = 1:3,
                "<i>Vulnerability</i>" = 4,
                "<i>Public Goods</i>" = 5:8,
                "<i>Controls<i/>" = 9:13,
                "<i>Fixed Effects<i/>"=14:17))


```

### Appendix Table 2: Distribution of Residuals

```{r,  warning = FALSE}
# Let's compare how normally distributed our residuals are across the three models.
library(tidyverse)
# Gather and label residuals from each model

mstat = bind_rows(
  data.frame(
    residuals = m.p %>% from_zelig_model() %>% residuals.glm() %>% scale,
    predicted = m.p %>% from_zelig_model() %>% fitted.values %>% scale(),
    model = "poisson"),
  data.frame(
    residuals = m.nb %>% from_zelig_model() %>% residuals.glm() %>% scale,
    predicted = m.nb %>% from_zelig_model() %>% fitted.values %>% scale,
    model = "negative binomial"),
  data.frame(
    residuals = m.g %>% residuals.glm() %>% scale,
    predicted = m.g %>% fitted.values %>% scale(),
    model = "gamma"),
  data.frame(
    residuals = m.lm %>% from_zelig_model() %>% residuals %>% scale,
    predicted = m.lm %>% from_zelig_model() %>% fitted.values %>% scale(),
    model = "log-OLS")) %>%
  # Code order in which models should appear
  mutate(model = factor(model, levels = c("poisson", "negative binomial", "gamma", "log-OLS")))
```


```{r,  warning = FALSE}
mstat %>%
  # Visualize density plot
  ggplot(mapping = aes(x = residuals, fill = model)) +
    
  geom_density(color = "white", bins = 30, alpha = 0.25) +
  geom_density(. %>% filter(model == "log-OLS"),
               mapping = aes(x = residuals), color = "red", linetype = "dashed", fill = NA) +
  # Plot a normal distribution over them
  stat_function(fun = dnorm) +
  # Label and format theme
  theme_minimal() +
  labs(x = "Standardized Residuals", y = "Frequency (% of Cases)",
       fill = "Model Type",
       title = "Which model's residuals are most normally distributed?",
       caption = "log-OLS model (dashed red line) looks acceptable.")
# In the end, the OLS model isn't terribly off.
```

Now let's check out whether it has non-constant variance.

```{r,  warning = FALSE}
mstat %>%
  ggplot(mapping = aes(x = predicted, y = residuals, color = model)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# This chart suggests that a) OLS tends to have wider residuals, BUT
# OLS also has fewer crazy outliers.
# And, on the whole, it's pretty un-patterned, which is good news.
# As long as we use robust standard errors, it should be fine.
```


```{r,  warning = FALSE}
remove(r.lm, r.g, r.nb, m.p, m.lm, m.g, m.nb, gof)
```




Unfortunately, we can't add every interaction effect under the sun. (See end of the document, where we try.) Nothing is significant, because we only have ~200 cases to explain variation with. So, the interaction effects add way too many additional variables to our model for it to work properly.

Instead, let's try a reduced model.

```{r,  warning = FALSE}
write_csv(dat.count, "iburi_dataset.csv")

```


## 4.3 Model Table II: Temporal Interactions

### Models


```{r,  warning = FALSE, message = FALSE, warning = FALSE}
# Basic Model
m1 <- dat.count %>%
  mutate_at(vars(-c("id_facebook", "date", 
                    "lat", "long", "area_code",
                    "num_people_diff", "num_people_diff_per_capita",
                    "bonding", "bridging", "linking", "vulnerability", "time")), funs(. %>% scale() %>% as.numeric)) %>%
 zelig(formula = log(num_people_diff_per_capita + 0.0000000001) ~ km +
         # Indices        
         time + bonding + bridging + linking + vulnerability +
         # Controls
          exp_public_works_per_capita_2016 +
          exp_fire_per_capita_2016 +
          shelters_opened_per_capita +
          water_outage_length_days +
          buildings_damaged_per_capita +
          evacuees_per_capita +
          rev_to_exp_ratio_2016,
       model = "ls")

# Interaction Effect of Bonding & Bridging
m2 <- dat.count %>%
  mutate_at(vars(-c("id_facebook", "date", 
                    "lat", "long", "area_code",
                    "num_people_diff", "num_people_diff_per_capita",
                    "bonding", "bridging", "linking", "vulnerability", "time")), funs(. %>% scale() %>% as.numeric)) %>%
 zelig(formula = log(num_people_diff_per_capita + 0.0000000001) ~ km +
        # Indices
        time + bonding + bridging + linking + vulnerability +
        # Interaction
        time*linking +
          # Controls
          exp_public_works_per_capita_2016 +
          exp_fire_per_capita_2016 +
          shelters_opened_per_capita +
          water_outage_length_days +
          buildings_damaged_per_capita +
          evacuees_per_capita +
          rev_to_exp_ratio_2016, 
       model = "ls")

# Interaction Effect for Bonding, Bridging, & Linking
m3 <- dat.count %>%
  mutate_at(vars(-c("id_facebook", "date", 
                    "lat", "long", "area_code",
                    "num_people_diff", "num_people_diff_per_capita",
                    "bonding", "bridging", "linking", "vulnerability")), funs(. %>% scale() %>% as.numeric)) %>%
 zelig(formula = log(num_people_diff_per_capita + 0.0000000001) ~ km +
         # Indices
        time + bonding + bridging + linking + vulnerability +
         # Interaction
         time * linking * bridging +
         # controls
          exp_public_works_per_capita_2016 +
          exp_fire_per_capita_2016 +
          shelters_opened_per_capita +
          water_outage_length_days +
          buildings_damaged_per_capita +
          evacuees_per_capita +
          rev_to_exp_ratio_2016, 
       model = "ls")

# Interaction Effect for Bonding, Bridging, Linking, & Vulnerability
m4 <- dat.count %>%
  mutate_at(vars(-c("id_facebook", "date", 
                    "lat", "long", "area_code",
                    "num_people_diff", "num_people_diff_per_capita",
                    "bonding", "bridging", "linking", "vulnerability")), funs(. %>% scale() %>% as.numeric)) %>%
 zelig(formula = log(num_people_diff_per_capita + 0.0000000001) ~ km +
         # Indices
        time + bonding + bridging + linking + vulnerability +
         # Interaction
         time * linking * bridging * vulnerability +
         # controls
          exp_public_works_per_capita_2016 +
          exp_fire_per_capita_2016 +
          shelters_opened_per_capita +
          water_outage_length_days +
          buildings_damaged_per_capita +
          evacuees_per_capita +
          rev_to_exp_ratio_2016,
       model = "ls")





robust = function(data){
  return(  
    lmtest::coeftest(data, vcov = sandwich::vcovHC(data)) %>%
      broom::tidy()
  )
}

# Get robust standard errors
r1 <- m1 %>% from_zelig_model() %>% robust()
r2 <- m2 %>% from_zelig_model() %>% robust()
r3 <- m3 %>% from_zelig_model() %>% robust()
r4 <- m4 %>% from_zelig_model() %>% robust()
```


### Visualize

```{r,  warning = FALSE}
# Create new fake data.frame over time
newdata = dat.count %>%
  with(expr = data.frame(time = time %>% mean(),
                         bonding = bonding %>% mean(),
                         bridging = bridging %>% mean(), 
                         linking = seq(from = 0.0, to = 1, length.out = 100),
                         vulnerability = vulnerability %>% mean(),
                         km = 0,
                         exp_public_works_per_capita_2016 = 0, 
                         exp_fire_per_capita_2016 = 0, 
                         shelters_opened_per_capita = 0,
                         water_outage_length_days = 0,
                         buildings_damaged_per_capita = 0,
                         evacuees_per_capita  = 0,
                         rev_to_exp_ratio_2016 = 0))
result <- predict(m1, newdata = newdata, se.fit = TRUE, type = "response") %>%
  as.data.frame() %>%
  bind_cols(newdata) %>%
  select(fit, se.fit, linking)

# Make one with linking and bridging

# Please give me a


bind_rows(
  result %>%
    mutate(lower = fit - se.fit*2.58,
           upper = fit + se.fit*2.58,
           type = "99%"),
  result %>%
    mutate(lower = fit - se.fit*1.96,
           upper = fit + se.fit*1.96,
           type = "95%"),
  result %>%
    mutate(lower = fit - se.fit*1.64,
           upper = fit + se.fit*1.64,
           type = "90%")
) %>%
  ggplot(mapping = aes(x = linking, y = fit, ymin = lower, ymax = upper, group = type)) +
  geom_ribbon(alpha = 0.5, fill = "steelblue") +
  geom_line(linetype = "dashed") +
  theme_minimal() +
  labs(x = "Linking Social Capital Index (0-1)",
       y = "Predicted Log-Evacuation Rate",
       title = "Linking Social Capital Reduces Long-Distance Evacuation",
       caption = "(Bands depict 99, 95, and 90% confidence intervals)") +
  theme(plot.title = element_text(hjust = 0.5),
        plot.caption = element_text(hjust = 0.5))

```



### Table

```{r,  warning = FALSE, results = "asis", echo = FALSE}

# Create a single table for all models
texreg::htmlreg(
  list(m1, m2, m3, m4),
  stars = c(0.001, 0.01, 0.05, 0.10),
  custom.model.names = c("Basic","Over Time", 
                         "With Bridging", "with Vulnerability"),
  bold = 0.05,
  override.se = list(
    r1$std.error,
    r2$std.error,
    r3$std.error,
    r4$std.error),
  override.pvalues = list(
    # OLS robust p.values
    r1$p.value,
    r2$p.value,
    r3$p.value,
    r4$p.value),
  custom.coef.map = list(
    # Indices (1-4)
    "bonding" = "Bonding Social Capital", # Row 3
    "bridging" = "Bridging Social Capital", # Row 4
    "linking" = "Linking Social Capital", # Row 5
    "vulnerability" = "Social Vulnerability Index", # Row 6
    # Basic Interactions (5-8)
    "bridging:linking" = "Linking x Bridging",
    "linking:vulnerability" = "Linking x Vulnerability",
    "bridging:vulnerability" = "Bridging x Vulnerability",
    "bridging:linking:vulnerability" = "Linking x Bridging x Vulnerability",
    # Time Interactions
    "time" = "Time",
    "time:linking" = "Time x Linking",
    "time:bridging" = "Time x Bridging",
    "time:vulnerability" = "Time x Vulnerability",
    "time:bridging:linking" = "Time x Linking x Bridging",
    "time:linking:vulnerability" = "Time x Linking x Vulnerability",
    "time:bridging:vulnerability" = "Time x Bridging x Vulnerability",
    "time:bridging:linking:vulnerability" = "Time x Linking x Bridging x Vulnerability",
    # Controls (16 - 24)
    "rev_to_exp_ratio_2016" = "Revenues to Expenditures Ratio", # Row 13
    
    "exp_fire_per_capita_2016" = "Emergency Services Spending", # Row 7
    "exp_public_works_per_capita_2016" = "Public Works Spending", # Row 8
    "shelters_opened_per_capita" = "Shelters Opened per capita", # Row 9
    
    "km" = "Distance from Epicenter", # Row 2
    "buildings_damaged_per_capita" = "Buildings Damaged <br>    per capita", # Row 11
    "evacuees_per_capita" = "Evacuees to Local Shelters <br>    per capita", # Row 12
    "water_outage_length_days" = "Days of Water Outages", # Row 10 
    
    "pop_2015" = "Population", # Row 18
    "(Intercept)" = "Constant"# Row 1
  ),
  groups = list(
    "<i>Indices</i>" = 1:4,
    "<i>Basic Interactions</i>" = 5:8,
    "<i>Temporal Interactions</i>" = 9:16,
    "<i>Public Goods</i>" = 17:20,
    "<i>Controls<i/>" = 21:25),
  caption = "*** signifies statistical significance at the p < 0.001 level, ** at p < 0.01, * at p < 0.05, and . at p < 0.10."
  )


car::vif(m1 %>% from_zelig_model())
```








## Figure 2: Simulated Effects

```{r,  warning = FALSE, fig.height=6, eval = FALSE}



# Redo the model, so that it is no longer scaled.
# We needed scaled effects for the table, but we want interpretable variables
# for this chart
m = dat.count %>%
 zelig(formula = log(num_people_diff_per_capita + 0.0000000001) ~ km +
          bonding + 
          bridging + 
          linking +
          vulnerability +
          exp_public_works_per_capita_2016 +
          exp_fire_per_capita_2016 +
          shelters_opened_per_capita +
          water_outage_length_days +
          buildings_damaged_per_capita +
          evacuees_per_capita +
          rev_to_exp_ratio_2016 +
          date, model = "ls")

summary(m)
```

Using the zelig package, let's simulate the expected log of evacuees per capita based on our model, and exponentiate the result, and visualize it. We produce simulated values by varying the inputs for specific variables.
```{r,  warning = FALSE, eval = FALSE}

simulated = bind_rows(
  # Simulate expected values given linking capital changes
  m %>%
    setx(linking = seq(0.0, 1.0, by = 0.05)) %>%
    sim() %>%
    zelig_qi_to_df() %>%
    qi_slimmer() %>%
    rename(value = linking) %>%
    mutate(variable = "Linking Social Capital"),
  # Simulate expected values given vulnerability changes
  m %>%
    setx(vulnerability = seq(0.0, 1.0, by = 0.05)) %>%
    sim() %>%
    zelig_qi_to_df() %>%
    qi_slimmer() %>%
    rename(value = vulnerability) %>%
    mutate(variable = "Social Vulnerability"),
  
  # Simulate expected values given emergency services changes
  m %>%
    setx(exp_fire_per_capita_2016 = seq(0.0, 150, by = 5)) %>%
    sim() %>%
    zelig_qi_to_df() %>%
    qi_slimmer() %>%
    rename(value = exp_fire_per_capita_2016) %>%
    mutate(variable = "Emergency Services"),
  # Simulate expected values given shelters changes
  m %>%
    setx(shelters_opened_per_capita = seq(0, 10, by = .5)/1000) %>%
    sim() %>%
    zelig_qi_to_df() %>%
    qi_slimmer() %>%
    rename(value = shelters_opened_per_capita) %>%
    mutate(variable = "Shelters")
) 



```


```{r,  warning = FALSE, eval = FALSE}
simulated %>%
  mutate(value = if_else(variable == "Shelters", value*1000, value)) %>%
  mutate(variable = variable %>% recode_factor(
    "Linking Social Capital" = "Linking \n Social Capital", 
    "Social Vulnerability" = "Social \n Vulnerability",
    "Emergency Services" = "Emergency Services \n (thousands of yen \n per capita)",
    "Shelters" = "Shelters \n per thousand persons")) %>%
  ggplot(mapping = aes(x = value, y = exp(qi_ci_median)*1000,
                       ymin = exp(qi_ci_min)*1000, ymax = exp(qi_ci_max)*1000,
                       group = variable, fill = variable)) +
  geom_ribbon(alpha = 0.5) +
  geom_line() +
  scale_fill_viridis(option = "plasma", discrete = TRUE) +
  scale_y_continuous(trans = "log10")  +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_wrap(~variable, scales = "free", nrow = 2) +
  guides(fill = FALSE) +
  labs(x = "", 
       y = "Expected Evacuation \n per thousand persons",
       caption = "Bands show simulated 95% confidence intervals. Y-axes log-scaled.",
       title = "Drivers of Evacuation")
```

```{r,  warning = FALSE, eval = FALSE}
remove(m, simulated)
```




```{r,  warning = FALSE, eval = FALSE}
remove(m.lm1, m.lm2, m.lm3, m.lm4,
       r.lm1, r.lm2, r.lm3, r.lm4)
```







# 3. Network Visualization

```{r,  warning = FALSE, message=FALSE,warning=FALSE, include=FALSE}
bind_rows(
  # Bind together into edgelist
  read_csv("disp.csv") %>%
    select(from = `Top Source City`, to = `City Name`, 
           weight = `Top Source Proportion`, date = `Ds`),
  read_csv("disp.csv") %>%
    select(from = `Second Source City`,  to = `City Name`,
           weight = `Second Source Proportion`, date = `Ds`),
  read_csv("disp.csv") %>%
    select(from = `Third Source City`, to = `City Name`, 
           weight = `Third Source Proportion`, date = `Ds`)) %>%
  # exclude NAs
  filter(!is.na(weight)) %>%
  write.csv("edgelist.csv", row.names = FALSE)
```

```{r,  warning = FALSE, warning = FALSE, message = FALSE}
library(tidygraph)
library(ggraph)
library(tidyverse)

# Import edgelist
g = read_csv("edgelist.csv") %>%
  # filter to after disaster
  filter(date > "2018-09-06") %>%
  # remove loops
  filter(from != to) %>%
  mutate(date = as.character(date)) %>%
  # convert dates from Date format into readable format
  mutate(date = date %>% dplyr::recode_factor(
    "2018-09-10" = "1",
    "2018-09-17" = "2",
    "2018-09-24" = "3",
    "2018-10-01" = "4",
    "2018-10-08" = "5")) %>%
  # Convert to tidy-graph
  as_tbl_graph() %>%
    activate(nodes) %>%
  # Join in prefecture data
  left_join(y = read_csv("facebook_unique_ids.csv") %>%
              select(id_facebook, prefecture) %>%
              distinct(),
            by = c("name" = "id_facebook")) %>%
  # Join in distance from epicenter
  left_join(y = read_csv("facebook_data.csv") %>%
  select(id_facebook, km) %>% distinct(),
  by = c("name" = "id_facebook")) %>%
  # Identify nodes in and outside of Hokkaido
  mutate(hokkaido = if_else(prefecture == "Hokkaido", "Hokkaido", "Other") %>% factor()) %>%
  # Calculate degree
  mutate(degree = centrality_degree(mode = "in", loops = FALSE))
```

#### Figure: Diversity of Destinations, but One Source
```{r,  warning = FALSE, warning = FALSE, message=FALSE}

# Visualize overall
viz1 = g %>%
  ggraph(layout = "kk") +
  geom_edge_diagonal(color = "gray", alpha = 0.2) +
  geom_node_point(aes(color = hokkaido, size = degree)) +
  geom_node_text(aes(label = if_else(
    name == "sapporo-shi", "Sapporo", NA_character_)), size = 5, family = "Arial") +
  theme_graph() +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        plot.caption = element_text(hjust = 0.5)) + 
  guides(size = FALSE) +
  labs(
    title = "Excess Movement from Sapporo \n Overall",
    color = "Prefecture",
    size = "In-Degree Centrality",
    caption = "Over 5 weeks, Sapporo was source \n for users going to 96 unique cities, \n and a destination for users from 2 unique cities \n Made using a Kamada-Kawai layout.")

```



#### Figure: Hokkaido Evacuation Sources and Destinations
```{r,  warning = FALSE}
# Visualize Hokkaido Locations
viz2 = g %>%
  activate(nodes) %>%
  filter(hokkaido == "Hokkaido") %>%
  mutate(name = paste(
    name %>% str_sub(1,1) %>% toupper(),
    name %>% str_sub(2), sep = "") %>%
      str_remove_all("-shi|-gun|-mura|furu-gun")) %>%
  ggraph(layout = "fabric", sort.by = node_rank_fabric()) +
  geom_edge_diagonal(color = "gray", alpha = 0.2) +
  geom_node_point(aes(size = degree, color = hokkaido)) +
  geom_node_text(aes(label = name), # if_else(degree >= 5, name, NA_character_)),
                 size = 2.5, family = "Arial") +
  theme_graph() + guides(color = FALSE) +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        plot.caption = element_text(hjust = 0.5)) +
  labs(
    title = "Excess Movement from Sapporo \n in Hokkaido",
    size = "In-Degree Centrality",
    caption = "Over 5 weeks, Hokkaido cities most often \n nominated Sapporo as their top source city. \n Made using a Fabric layout.")
```


```{r,  warning = FALSE}
viz1
```

```{r,  warning = FALSE}
viz2
```

```{r,  warning = FALSE, fig.width=10, fig.height=5, warning = FALSE}
gridExtra::grid.arrange(viz1, viz2,
             layout_matrix = t(matrix(c(1, 1,2, 2, 2))))
```


```{r,  warning = FALSE}
remove(viz1, viz2, g, dat)
```



